/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.db;

import java.io.*;
import java.util.*;
import java.util.logging.*;
import java.nio.channels.*;

import net.nutch.io.*;
import net.nutch.util.*;
import net.nutch.pagedb.*;
import net.nutch.linkdb.*;

/***************************************************
 * This is a wrapper class that allows us to reorder
 * write operations to the linkdb and pagedb.  It is
 * useful only for objects like UpdateDatabaseTool,
 * which just do writes.
 *
 * The DistributedWebDBWriter is the distributed implementor
 * of IWebDBWriter.  Instructions are buffered to per-machine
 * "edit section" files on the NutchFileSystem; at close(),
 * each participating machine sorts the edits for its own key
 * range and merges them into a fresh copy of its db section.
 * (WebDBWriter is the traditional single-process equivalent.)
 *
 * @author Mike Cafarella
 *************************************************/
public class DistributedWebDBWriter implements IWebDBWriter {
    static final Logger LOG = LogFormatter.getLogger("net.nutch.db.DistributedWebDBWriter");
    static final byte CUR_VERSION = 0;
    static final byte OPEN_COUNTER_VERSION = 0;
    static final byte CLOSE_COUNTER_VERSION = 0;
    static final byte MACHINE_INFO_VERSION = 0;

    // magic numbers
    static int READY_TO_USE = 0xbabecafe;
    static int IS_COMPLETE = 0xbabe0000;
    static int WRITE_LOCK_INFO = 0xcafe0000;
    static long LONG_TIMEOUT = 10 * 1000;

    // db opcodes
    static final byte ADD_PAGE = 0;
    static final byte ADD_PAGE_WITH_SCORE = 1;
    static final byte ADD_PAGE_IFN_PRESENT = 2;
    static final byte DEL_PAGE = 3;
    static final int ADD_LINK = 0;
    static final int DEL_LINK = 1;
    static final int DEL_SINGLE_LINK = 2;

    // filenames
    static final String PAGES_BY_URL = "pagesByURL";
    static final String PAGES_BY_MD5 = "pagesByMD5";
    static final String LINKS_BY_URL = "linksByURL";
    static final String LINKS_BY_MD5 = "linksByMD5";
    static final String STATS_FILE = "stats";
    static final String META_SHAREGROUP = "metashare";
    static final String METAINFO = "metainfo";

    // Result codes for page-url comparisons
    static final int NO_OUTLINKS = 0;
    static final int HAS_OUTLINKS = 1;
    static final int LINK_INVALID = 2;

    /********************************************
     * PageInstruction holds an operation over a Page.
     *********************************************/
    public static class PageInstruction implements WritableComparable {
        byte opcode;
        boolean hasLink;
        Page page;
        Link link;

        /** */
        public PageInstruction() {}

        /** */
        public PageInstruction(Page page, int opcode) {
            set(page, opcode);
        }

        /** */
        public PageInstruction(Page page, Link link, int opcode) {
            set(page, link, opcode);
        }

        /**
         * Init from another PageInstruction object.
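         * Performs a deep copy, so the source instruction may be reused
         * by the caller.  A reuse sketch (variable names are illustrative):
         * <pre>
         *   PageInstruction scratch = new PageInstruction();
         *   scratch.set(incoming);   // 'incoming' can now be overwritten safely
         * </pre>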
*/ public void set(PageInstruction that) { this.opcode = that.opcode; if (this.page == null) { this.page = new Page(); } this.page.set(that.page); if (this.link == null) { this.link = new Link(); } this.hasLink = that.hasLink; if (this.hasLink) { this.link.set(that.link); } } /** * Init PageInstruction with no Link */ public void set(Page page, int opcode) { this.opcode = (byte) opcode; this.page = page; this.hasLink = false; this.link = null; } /** * Init PageInstruction with a Link */ public void set(Page page, Link link, int opcode) { this.opcode = (byte) opcode; this.page = page; this.hasLink = true; this.link = link; } // // WritableComparable // public int compareTo(Object o) { int pageResult = this.page.compareTo(((PageInstruction) o).page); if (pageResult != 0) { return pageResult; } else { return this.opcode - (((PageInstruction) o).opcode); } } public void write(DataOutput out) throws IOException { out.writeByte(opcode); page.write(out); out.writeByte(hasLink ? 1 : 0); if (hasLink) { link.write(out); } } public void readFields(DataInput in) throws IOException { opcode = in.readByte(); if (page == null) { page = new Page(); } page.readFields(in); if (link == null) { link = new Link(); } hasLink = (1 == in.readByte()); if (hasLink) { link.readFields(in); } } public Page getPage() { return page; } public Link getLink() { if (hasLink) { return link; } else { return null; } } public int getInstruction() { return opcode; } /** * Sorts the instruction first by Page, then by opcode. */ public static class PageComparator extends WritableComparator { private static final Page.Comparator PAGE_COMPARATOR = new Page.Comparator(); public PageComparator() { super(PageInstruction.class); } /** Optimized comparator. */ public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) { int opcode1 = b1[s1]; int opcode2 = b2[s2]; int c = PAGE_COMPARATOR.compare(b1, s1+1, l1-1, b2, s2+1, l2-1); if (c != 0) return c; return opcode1 - opcode2; } } /***************************************************** * Sorts the instruction first by url, then by opcode. *****************************************************/ public static class UrlComparator extends WritableComparator { private static final Page.UrlComparator PAGE_COMPARATOR = new Page.UrlComparator(); public UrlComparator() { super(PageInstruction.class); } /** * We need to sort by ordered URLs. First, we sort by * URL, then by opcode. */ public int compare(WritableComparable a, WritableComparable b) { PageInstruction instructionA = (PageInstruction)a; PageInstruction instructionB = (PageInstruction)b; Page pageA = instructionA.getPage(); Page pageB = instructionB.getPage(); int result = pageA.getURL().compareTo(pageB.getURL()); if (result != 0) { return result; } else { return instructionA.opcode - instructionB.opcode; } } /** * Optimized comparator. */ public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) { int opcode1 = b1[s1]; int opcode2 = b2[s2]; int c = PAGE_COMPARATOR.compare(b1, s1+1, l1-1, b2, s2+1, l2-1); if (c != 0) return c; return opcode1 - opcode2; } } } /******************************************************** * PageInstructionWriter very efficiently writes a * PageInstruction to an EditSectionGroupWriter. 
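 * A single PageInstruction instance is reused across calls; addPage()
 * below, for example, does:
 * <pre>
 *   piwriter.appendInstructionInfo(pagesByURLWriter, page,
 *                                  ADD_PAGE, NullWritable.get());
 * </pre>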
 * Much better than calling "writer.append(new PageInstruction())".
 ********************************************************/
public static class PageInstructionWriter {
    PageInstruction pi = new PageInstruction();

    /** */
    public PageInstructionWriter() {
    }

    /**
     * Append the PageInstruction info to the indicated SequenceFile,
     * and keep the PI for later reuse.
     */
    public synchronized void appendInstructionInfo(EditSectionGroupWriter writer, Page page, int opcode, Writable val) throws IOException {
        pi.set(page, opcode);
        writer.append(pi, val);
    }

    /**
     * Append the PageInstruction info to the indicated SequenceFile,
     * and keep the PI for later reuse.
     */
    public synchronized void appendInstructionInfo(EditSectionGroupWriter writer, Page page, Link link, int opcode, Writable val) throws IOException {
        pi.set(page, link, opcode);
        writer.append(pi, val);
    }
}

/*************************************************************
 * Reduce multiple instructions for a given url to the single effective
 * instruction.  ADD is prioritized highest, then ADD_IFN_PRESENT, and
 * then DEL.  Not coincidentally, this is exactly the order they're
 * sorted in (ascending opcode), so keeping only the first instruction
 * for each URL keeps the highest-priority one.
 **************************************************************/
private static class DeduplicatingPageSequenceReader {
    SequenceFile.Reader edits;
    PageInstruction current = new PageInstruction();
    UTF8 currentUrl = new UTF8();
    boolean haveCurrent;

    /** */
    public DeduplicatingPageSequenceReader(SequenceFile.Reader edits) throws IOException {
        this.edits = edits;
        this.haveCurrent = edits.next(current, NullWritable.get());
    }

    /** */
    public boolean next(PageInstruction result) throws IOException {
        if (!haveCurrent) {
            return false;
        }

        currentUrl.set(current.getPage().getURL());
        result.set(current);                 // take the first instruction

        do {                                 // skip the rest
        } while ((haveCurrent = edits.next(current, NullWritable.get())) &&
                 currentUrl.compareTo(current.getPage().getURL()) == 0);
        return true;
    }
}

/*************************************************
 * Holds an instruction over a Link.
 *************************************************/
public static class LinkInstruction implements WritableComparable {
    Link link;
    int instruction;

    /** */
    public LinkInstruction() {
    }

    /** */
    public LinkInstruction(Link link, int instruction) {
        set(link, instruction);
    }

    /**
     * Re-init from another LinkInstruction's info.
     */
    public void set(LinkInstruction that) {
        this.instruction = that.instruction;
        if (this.link == null)
            this.link = new Link();
        this.link.set(that.link);
    }

    /**
     * Re-init with a Link and an instruction
     */
    public void set(Link link, int instruction) {
        this.link = link;
        this.instruction = instruction;
    }

    //
    // WritableComparable
    //
    public int compareTo(Object o) {
        return this.link.compareTo(((LinkInstruction) o).link);
    }

    public void write(DataOutput out) throws IOException {
        out.writeByte(instruction);
        link.write(out);
    }

    public void readFields(DataInput in) throws IOException {
        this.instruction = in.readByte();
        if (link == null)
            link = new Link();
        link.readFields(in);
    }

    public Link getLink() {
        return link;
    }

    public int getInstruction() {
        return instruction;
    }

    /*******************************************************
     * Sorts instructions by their Link's MD5 ordering
     * (md5Compare); the opcode takes no part in the comparison.
*******************************************************/ public static class MD5Comparator extends WritableComparator { private static final Link.MD5Comparator MD5_COMPARATOR = new Link.MD5Comparator(); public MD5Comparator() { super(LinkInstruction.class); } public int compare(WritableComparable a, WritableComparable b) { LinkInstruction instructionA = (LinkInstruction)a; LinkInstruction instructionB = (LinkInstruction)b; return instructionA.link.md5Compare(instructionB.link); } /** Optimized comparator. */ public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) { return MD5_COMPARATOR.compare(b1, s1+1, l1-1, b2, s2+1, l2-1); } } /********************************************************* * Sorts the instruction first by url, then by opcode. *********************************************************/ public static class UrlComparator extends WritableComparator { private static final Link.UrlComparator URL_COMPARATOR = new Link.UrlComparator(); public UrlComparator() { super(LinkInstruction.class); } public int compare(WritableComparable a, WritableComparable b) { LinkInstruction instructionA = (LinkInstruction)a; LinkInstruction instructionB = (LinkInstruction)b; return instructionA.link.urlCompare(instructionB.link); } /** * Optimized comparator. */ public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) { return URL_COMPARATOR.compare(b1, s1+1, l1-1, b2, s2+1, l2-1); } } } /******************************************************* * LinkInstructionWriter very efficiently writes a * LinkInstruction to an EditSectionGroupWriter. Much better * than calling "writer.append(new LinkInstruction())" ********************************************************/ public static class LinkInstructionWriter { LinkInstruction li = new LinkInstruction(); /** */ public LinkInstructionWriter() { } /** * Append the LinkInstruction info to the indicated SequenceFile * and keep the LI for later reuse. */ public synchronized void appendInstructionInfo(EditSectionGroupWriter writer, Link link, int opcode, Writable val) throws IOException { li.set(link, opcode); writer.append(li, val); } } /******************************************************** * This class deduplicates link operations. We want to * sort by MD5, then by URL. But all operations * should be unique. *********************************************************/ class DeduplicatingLinkSequenceReader { Link currentKey = new Link(); LinkInstruction current = new LinkInstruction(); SequenceFile.Reader edits; boolean haveCurrent; /** */ public DeduplicatingLinkSequenceReader(SequenceFile.Reader edits) throws IOException { this.edits = edits; this.haveCurrent = edits.next(current, NullWritable.get()); } /** * The incoming stream of edits is sorted first by MD5, then by URL. * MD5-only values always come before MD5+URL. */ public boolean next(LinkInstruction key) throws IOException { if (! haveCurrent) { return false; } currentKey.set(current.getLink()); do { key.set(current); } while ((haveCurrent = edits.next(current, NullWritable.get())) && currentKey.compareTo(current.getLink()) == 0); return true; } } /************************************************** * The CloseProcessor class is used when we close down * the webdb. We give it the path, members, and class values * needed to apply changes to any of our 4 data tables. * * This is an abstract class. Each subclass must define * the exact merge procedure. However, file-handling * and edit-processing is standardized as much as possible. 
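 *
 * At close() the processors run in sequence: pagesByURL, pagesByMD5,
 * linksByMD5, then linksByURL (plus, when needed, a second linksByMD5
 * pass); earlier stages emit the edit streams that later stages consume.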
* **************************************************/ private abstract class CloseProcessor { String basename; String curDBPart; MapFile.Reader oldDb; EditSectionGroupWriter editWriter; SequenceFile.Sorter sorter; WritableComparator comparator; Class keyClass, valueClass; long itemsWritten = 0; /** * Store away these members for later use. */ CloseProcessor(String basename, MapFile.Reader oldDb, EditSectionGroupWriter editWriter, SequenceFile.Sorter sorter, WritableComparator comparator, Class keyClass, Class valueClass, String curDBPart) { this.basename = basename; this.oldDb = oldDb; this.editWriter = editWriter; this.sorter = sorter; this.comparator = comparator; this.keyClass = keyClass; this.valueClass = valueClass; this.curDBPart = curDBPart; } /** * Perform the shutdown sequence for this Processor. * There is a lot of file-moving and edit-sorting that * is common across all the 4 tables. * * Returns how many items were written out by this close(). */ long closeDown(NutchFile workingDir, NutchFile outputDir) throws IOException { // // Done adding edits, so close edit-writer. // editWriter.close(); // // Where the output is going // NutchFile sectionDir = new NutchFile(outputDir, "dbsection." + machineNum); NutchFile newDbNF = new NutchFile(sectionDir, basename); // // Grab all the edits that we need to process. We build an EditSectionGroupReader // and aim it at the right location. The ESR will wait until all its // component Sections are written and completed before returning from // any method (other than the constructor). So we expect to possibly wait // inside the call to numEdits(). // EditSectionGroupReader edits = new EditSectionGroupReader(nutchfs, dbName, basename, machineNum, totalMachines); int numEdits = edits.numEdits(); // If there are edits, then process them. if (numEdits != 0) { File mergedEditsFile = edits.mergeSectionComponents(); File sortedEditsFile = new File(mergedEditsFile.getPath() + ".sorted"); // Sort the edits long startSort = System.currentTimeMillis(); sorter.sort(mergedEditsFile.getPath(), sortedEditsFile.getPath()); long endSort = System.currentTimeMillis(); LOG.info("Processing " + basename + ": Sorted " + numEdits + " instructions in " + ((endSort - startSort) / 1000.0) + " seconds."); LOG.info("Processing " + basename + ": Sorted " + (numEdits / ((endSort - startSort) / 1000.0)) + " instructions/second"); // Delete old file mergedEditsFile.delete(); // Read the sorted edits. That means read all // the edits from the local subsection of the // database. We must merge every machine's // contribution to the edit-list first (which // also means waiting until each machine has // completed that step). // Read the sorted edits SequenceFile.Reader sortedEdits = new SequenceFile.Reader(sortedEditsFile.getPath()); // Create a brand-new output db for the integrated data File newDbFile = nutchfs.getWorkingFile(); MapFile.Writer newDb = (comparator == null) ? new MapFile.Writer(newDbFile.getPath(), keyClass, valueClass) : new MapFile.Writer(newDbFile.getPath(), comparator, valueClass); // Iterate through the edits, and merge changes with existing // db into the brand-new file oldDb.reset(); // Merge the edits. We did it! 
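                // Both inputs are sorted on the same key, so mergeEdits()
                // can interleave them in a single pass.  In outline
                // (pseudocode only; each subclass supplies the real loop):
                //
                //   while (hasEntries && hasEdits) { compare keys; copy, replace, or skip }
                //   drain whichever input remains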
long startMerge = System.currentTimeMillis(); mergeEdits(oldDb, sortedEdits, newDb); long endMerge = System.currentTimeMillis(); LOG.info("Processing " + basename + ": Merged to new DB containing " + itemsWritten + " records in " + ((endMerge - startMerge) / 1000.0) + " seconds"); LOG.info("Processing " + basename + ": Merged " + (itemsWritten / ((endMerge - startMerge) / 1000.0)) + " records/second"); // Close down readers, writers sortedEdits.close(); newDb.close(); // Delete the (sorted) merged-edits sortedEditsFile.delete(); // Store the newly-written db file nutchfs.put(newDbNF, newDbFile, true); } else { // Otherwise, simply copy the original file into place, // without all the processing overhead. long startCopy = System.currentTimeMillis(); NutchFile srcSectionDir = new NutchFile(dbDir, "dbsection." + machineNum); NutchFile srcDbNF = new NutchFile(srcSectionDir, basename); File srcDbFile = nutchfs.get(srcDbNF); nutchfs.put(newDbNF, srcDbFile, true); long endCopy = System.currentTimeMillis(); LOG.info("Processing " + basename + ": Copied file (" + srcDbFile.length()+ " bytes) in " + ((endCopy - startCopy) / 1000.0) + " secs."); } // Delete the now-consumed edits file to save space edits.delete(); return itemsWritten; } /** * The loop that actually applies the changes and writes to * a new db. This is different for every subclass! */ abstract void mergeEdits(MapFile.Reader db, SequenceFile.Reader edits, MapFile.Writer newDb) throws IOException; } /*** * The PagesByURLProcessor is used during close() time for * the pagesByURL table. We instantiate one of these, and it * takes care of the entire shutdown process. */ private class PagesByURLProcessor extends CloseProcessor { EditSectionGroupWriter futureEdits; /** * We store "futureEdits" so we can write edits for the * next table-db step */ PagesByURLProcessor(MapFile.Reader db, EditSectionGroupWriter editWriter, EditSectionGroupWriter futureEdits) { super(PAGES_BY_URL, db, editWriter, new SequenceFile.Sorter(new PageInstruction.UrlComparator(), NullWritable.class), new UTF8.Comparator(), null, Page.class, "PagesByURLPart"); this.futureEdits = futureEdits; } /** * Merge the existing db with the edit-stream into a brand-new file. */ void mergeEdits(MapFile.Reader db, SequenceFile.Reader sortedEdits, MapFile.Writer newDb) throws IOException { // Create the keys and vals we'll be using DeduplicatingPageSequenceReader edits = new DeduplicatingPageSequenceReader(sortedEdits); WritableComparable readerKey = new UTF8(); Page readerVal = new Page(); PageInstruction editItem = new PageInstruction(); int futureOrdering = 0; // Read the first items from both streams boolean hasEntries = db.next(readerKey, readerVal); boolean hasEdits = edits.next(editItem); // As long as we have both edits and entries, we need to // interleave them.... while (hasEntries && hasEdits) { int comparison = readerKey.compareTo(editItem.getPage().getURL()); int curInstruction = editItem.getInstruction(); // Perform operations if ((curInstruction == ADD_PAGE) || (curInstruction == ADD_PAGE_WITH_SCORE) || (curInstruction == ADD_PAGE_IFN_PRESENT)) { if (comparison < 0) { // Write readerKey, just passing it along. // Don't process the edit yet. newDb.append(readerKey, readerVal); itemsWritten++; hasEntries = db.next(readerKey, readerVal); } else if (comparison == 0) { // The keys are equal. If the instruction // is ADD_PAGE, we write the edit's key and // replace the old one. // // Otherwise, if it's ADD_IFN_PRESENT, // keep the reader's item intact. 
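                        // Example: with an existing entry ("http://a/", md5=X)
                        // and an edit ADD_PAGE ("http://a/", md5=Y), the edit
                        // wins here, and a DEL_PAGE for the stale md5=X entry
                        // is forwarded to the MD5-keyed table below.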
// if ((curInstruction == ADD_PAGE) || (curInstruction == ADD_PAGE_WITH_SCORE)) { // An ADD_PAGE with an identical pair // of pages replaces the existing one. // We may need to note the fact for // Garbage Collection. // // This happens in three stages. // 1. We write necessary items to the future // edits-list. // pagesByMD5Edits++; // If this is a replacing add, we don't want // to disturb the score from the old Page! This, // way, we can run some link analysis scoring // while the new Pages are being fetched and // not lose the info when a Page is replaced. // // If it is an ADD_PAGE_WITH_SCORE, then we // go ahead and replace the old one. // // Either way, from now on we treat it // as an ADD_PAGE // Page editItemPage = editItem.getPage(); if (curInstruction == ADD_PAGE) { editItemPage.setScore(readerVal.getScore(), readerVal.getNextScore()); } piwriter.appendInstructionInfo(futureEdits, editItemPage, ADD_PAGE, NullWritable.get()); // // 2. We write the edit-page to *this* table. // newDb.append(editItemPage.getURL(), editItemPage); // // 3. We want the ADD in the next step (the // MD5-driven table) to be a "replacing add". // But that won't happen if the readerItem and // the editItem Pages are not identical. // (In this scenario, that means their URLs // are the same, but their MD5s are different.) // So, we need to explicitly handle that // case by issuing a DELETE for the now-obsolete // item. if (editItemPage.compareTo(readerVal) != 0) { pagesByMD5Edits++; piwriter.appendInstructionInfo(futureEdits, readerVal, DEL_PAGE, NullWritable.get()); } itemsWritten++; // "Delete" the readerVal by skipping it. hasEntries = db.next(readerKey, readerVal); } else { // ADD_PAGE_IFN_PRESENT. We only add IF_NOT // present. And it was present! So, we treat // this case like we treat a no-op. // Just move to the next edit. } // In either case, we process the edit. hasEdits = edits.next(editItem); } else if (comparison > 0) { // We have inserted a Page that's before some // entry in the existing database. So, we just // need to write down the Page from the Edit file. // It's like the above case, except we don't tell // the future-edits to delete anything. // // 1. Write the item down for the future. pagesByMD5Edits++; // // If this is an ADD_PAGE_IFN_PRESENT, then // we may also have a Link we have to take care of! // if (curInstruction == ADD_PAGE_IFN_PRESENT) { Link editLink = editItem.getLink(); if (editLink != null) { addLink(editLink); } } piwriter.appendInstructionInfo(futureEdits, editItem.getPage(), ADD_PAGE, NullWritable.get()); // // 2. Write the edit-page to *this* table newDb.append(editItem.getPage().getURL(), editItem.getPage()); itemsWritten++; // Process the edit hasEdits = edits.next(editItem); } } else if (curInstruction == DEL_PAGE) { if (comparison < 0) { // Write the readerKey, just passing it along. // We don't process the edit yet. newDb.append(readerKey, readerVal); itemsWritten++; hasEntries = db.next(readerKey, readerVal); } else if (comparison == 0) { // Delete it! We can only delete one item // at a time, as all URLs are unique. // 1. Tell the future-edits what page will need to // be deleted. pagesByMD5Edits++; piwriter.appendInstructionInfo(futureEdits, readerVal, DEL_PAGE, NullWritable.get()); // // 2. "Delete" the entry by skipping the Reader // key. hasEntries = db.next(readerKey, readerVal); // Process the edit hasEdits = edits.next(editItem); } else if (comparison > 0) { // Ignore it. We tried to delete an item that's // not here. 
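                        // The unmatched DEL is simply consumed here; nothing
                        // is forwarded to the MD5-keyed table for it.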
hasEdits = edits.next(editItem); } } } // Now we have only edits. No more preexisting items! while (! hasEntries && hasEdits) { int curInstruction = editItem.getInstruction(); if (curInstruction == ADD_PAGE || curInstruction == ADD_PAGE_WITH_SCORE || curInstruction == ADD_PAGE_IFN_PRESENT) { // No more reader entries, so ADD_PAGE_IFN_PRESENT // is treated like a simple ADD_PAGE. // 1. Tell the future edits-list about this new item pagesByMD5Edits++; piwriter.appendInstructionInfo(futureEdits, editItem.getPage(), ADD_PAGE, NullWritable.get()); // 2. Write the edit page to this table. newDb.append(editItem.getPage().getURL(), editItem.getPage()); itemsWritten++; } else if (curInstruction == DEL_PAGE) { // Ignore it. We tried to delete an item // that's not here. } // Either way, we always process the edit. hasEdits = edits.next(editItem); } // Now we have only preexisting items. We just copy // them to the new file, in order. while (hasEntries && ! hasEdits) { newDb.append(readerKey, readerVal); itemsWritten++; hasEntries = db.next(readerKey, readerVal); } } } /*** * The PagesByMD5Processor is used during close() time for * the pagesByMD5 table. We instantiate one of these, and it * takes care of the entire shutdown process. */ private class PagesByMD5Processor extends CloseProcessor { /** */ PagesByMD5Processor(MapFile.Reader db, EditSectionGroupWriter editWriter) { super(PAGES_BY_MD5, db, editWriter, new SequenceFile.Sorter(new PageInstruction.PageComparator(), NullWritable.class), null, Page.class, NullWritable.class, "PagesByMD5Part"); } /** */ void mergeEdits(MapFile.Reader db, SequenceFile.Reader sortedEdits, MapFile.Writer newDb) throws IOException { // Create the keys and vals Page readerItem = new Page(); PageInstruction editItem = new PageInstruction(); // For computing the GC list Page deletedItem = new Page(), lastItem = new Page(); boolean justDeletedItem = false; boolean newReaderItem = false; int itemRepeats = 0; // Read the first items from both streams boolean hasEntries = db.next(readerItem, NullWritable.get()); boolean hasEdits = sortedEdits.next(editItem, NullWritable.get()); if (hasEntries) { // The first thing we read should become // the "previous key". We need this for // garbage collection. outBuf.reset(); readerItem.write(outBuf); inBuf.reset(outBuf.getData(), outBuf.getLength()); lastItem.readFields(inBuf); itemRepeats = 0; } // As long we have both edits and entries, we need to // interleave them. while (hasEdits && hasEntries) { int comparison = readerItem.compareTo(editItem.getPage()); int curInstruction = editItem.getInstruction(); // // OK! Now perform operations // if (curInstruction == ADD_PAGE) { if (comparison < 0) { // Write readerItem, just passing it along. // Don't process the edit yet. newDb.append(readerItem, NullWritable.get()); itemsWritten++; hasEntries = db.next(readerItem, NullWritable.get()); newReaderItem = true; } else if (comparison == 0) { // // This is a "replacing ADD", which is generated // by the above-sequence. We should skip over the // existing item, and add the new one instead. // // Note that by this point, the new version of the // Page from the edit sequence is guaranteed to // have the correct score. We make sure of it in // the mergeEdits() for PagesByURLProcessor. // newDb.append(editItem.getPage(), NullWritable.get()); itemsWritten++; hasEntries = db.next(readerItem, NullWritable.get()); newReaderItem = true; hasEdits = sortedEdits.next(editItem, NullWritable.get()); } else if (comparison > 0) { // Write the edit item. 
                    // We've inserted an item that comes before any others.
                    newDb.append(editItem.getPage(), NullWritable.get());
                    itemsWritten++;
                    hasEdits = sortedEdits.next(editItem, NullWritable.get());
                }
            } else if (curInstruction == ADD_PAGE_IFN_PRESENT) {
                throw new IOException("Should never process ADD_PAGE_IFN_PRESENT for the index: " + editItem);
            } else if (curInstruction == DEL_PAGE) {
                if (comparison < 0) {
                    // Write the readerKey, just passing it along.
                    // Don't process the edit yet.
                    newDb.append(readerItem, NullWritable.get());
                    itemsWritten++;
                    hasEntries = db.next(readerItem, NullWritable.get());
                    newReaderItem = true;
                } else if (comparison == 0) {
                    // Delete it!  Remember, only one entry can
                    // be deleted at a time!
                    //
                    // "Delete" the entry by skipping over the reader
                    // item.  We move on to the next item in the existing
                    // index, as well as the next edit instruction.
                    hasEntries = db.next(readerItem, NullWritable.get());
                    newReaderItem = true;
                    hasEdits = sortedEdits.next(editItem, NullWritable.get());

                    // We need to set this flag for GC'ing.
                    justDeletedItem = true;
                } else if (comparison > 0) {
                    // This should never happen!  We should only be
                    // deleting items that actually appear!
                    throw new IOException("An unapplicable DEL_PAGE should never appear during index-merge: " + editItem);
                }
            }

            // GARBAGE COLLECTION
            // We want to detect when we have deleted the last
            // entry with a given MD5.  This index can hold
            // multiple entries with the same MD5, as long as
            // they have different URLs.  When the last one is
            // deleted, we want to know so we can modify the
            // LinkDB.
            if (newReaderItem) {
                // If the new readerItem has the same MD5 as our
                // last one, then we know it's a repeat!
                if (hasEntries && readerItem.getMD5().compareTo(lastItem.getMD5()) == 0) {
                    itemRepeats++;
                } else {
                    // The current readerItem and the lastItem
                    // MD5s are not equal.
                    //
                    // If the last item was deleted, AND if the
                    // deleted item is not a repeat of the current item,
                    // then that MD5 should be garbage collected.
                    if (justDeletedItem && itemRepeats == 0) {
                        deleteLink(lastItem.getMD5());
                    }

                    // The current readerItem is the new "last key".
                    outBuf.reset();
                    readerItem.write(outBuf);
                    inBuf.reset(outBuf.getData(), outBuf.getLength());
                    lastItem.readFields(inBuf);
                    itemRepeats = 0;
                }
                // Clear "new-reader-item" bit
                newReaderItem = false;
            }
            // Clear "last-deleted" bit
            justDeletedItem = false;
        }

        // Now we have only edits.  No more preexisting items!
        while (! hasEntries && hasEdits) {
            int curInstruction = editItem.getInstruction();
            if (curInstruction == ADD_PAGE) {
                // Just write down the new page!
                newDb.append(editItem.getPage(), NullWritable.get());
                itemsWritten++;
            } else if (curInstruction == ADD_PAGE_IFN_PRESENT) {
                throw new IOException("Should never process ADD_PAGE_IFN_PRESENT for the index: " + editItem);
            } else if (curInstruction == DEL_PAGE) {
                // This should never happen!  We should only be
                // deleting items that actually appear!
                throw new IOException("An unapplicable DEL_PAGE should never appear during index-merge: " + editItem);
            }
            hasEdits = sortedEdits.next(editItem, NullWritable.get());
        }

        // Now we have only preexisting items.  We just copy them
        // to the new file, in order.
        while (hasEntries && ! hasEdits) {
            // Simply copy through the remaining database items
            newDb.append(readerItem, NullWritable.get());
            itemsWritten++;
            hasEntries = db.next(readerItem, NullWritable.get());
            newReaderItem = true;
        }
    }
}

/**
 * The LinksByMD5Processor is used during close() for
 * the linksByMD5 table.
It processes all the edits to * this table, and also generates edits for the linksByURL * table. */ private class LinksByMD5Processor extends CloseProcessor { EditSectionGroupWriter futureEdits; /** */ public LinksByMD5Processor(MapFile.Reader db, EditSectionGroupWriter editWriter, EditSectionGroupWriter futureEdits) { super(LINKS_BY_MD5, db, editWriter, new SequenceFile.Sorter(new LinkInstruction.MD5Comparator(), NullWritable.class), new Link.MD5Comparator(), Link.class, NullWritable.class, "LinksByMD5Part"); this.futureEdits = futureEdits; } /** * Merges edits into the md5-driven link table. Also generates * edit sequence to apply to the URL-driven table. */ void mergeEdits(MapFile.Reader db, SequenceFile.Reader sortedEdits, MapFile.Writer newDb) throws IOException { WritableComparator comparator = new Link.MD5Comparator(); DeduplicatingLinkSequenceReader edits = new DeduplicatingLinkSequenceReader(sortedEdits); // Create the keys and vals we'll use LinkInstruction editItem = new LinkInstruction(); Link readerItem = new Link(); // Read the first items from both streams boolean hasEntries = db.next(readerItem, NullWritable.get()); boolean hasEdits = edits.next(editItem); // As long as we have both edits and entries to process, // we need to interleave them while (hasEntries && hasEdits) { int curInstruction = editItem.getInstruction(); // Perform operations if (curInstruction == ADD_LINK) { // When we add a link, we may replace a previous // link with identical URL and MD5 values. The // MD5FirstComparator will use both values. // int comparison = comparator.compare(readerItem, editItem.getLink()); if (comparison < 0) { // Write the readerKey, just passing it along. // Don't process the edit yet. newDb.append(readerItem, NullWritable.get()); itemsWritten++; hasEntries = db.next(readerItem, NullWritable.get()); } else if (comparison == 0) { // 1. Write down the item for table-edits if (futureEdits != null) { linksByURLEdits++; liwriter.appendInstructionInfo(futureEdits, editItem.getLink(), ADD_LINK, NullWritable.get()); } // 2. Write the new item, "replacing" the old one. // We move to the next edit instruction and move // past the replaced db entry. newDb.append(editItem.getLink(), NullWritable.get()); itemsWritten++; hasEntries = db.next(readerItem, NullWritable.get()); hasEdits = edits.next(editItem); } else if (comparison > 0) { // 1. Write down the item for table-edits if (futureEdits != null) { linksByURLEdits++; liwriter.appendInstructionInfo(futureEdits, editItem.getLink(), ADD_LINK, NullWritable.get()); } // 2. Write the new item. We stay at the current // db entry. newDb.append(editItem.getLink(), NullWritable.get()); itemsWritten++; hasEdits = edits.next(editItem); } } else if ((curInstruction == DEL_LINK) || (curInstruction == DEL_SINGLE_LINK)) { // When we delete a link, we might delete many // at once! We are interested only in the MD5 // here. If there are entries with identical MD5 // values, but different URLs, we get rid of them // all. int comparison = 0; if (curInstruction == DEL_LINK) { comparison = readerItem.getFromID().compareTo(editItem.getLink().getFromID()); } else { comparison = readerItem.md5Compare(editItem.getLink()); } if (comparison < 0) { // Write the readerKey, just passing it along. // Don't process the edit yet. newDb.append(readerItem, NullWritable.get()); itemsWritten++; hasEntries = db.next(readerItem, NullWritable.get()); } else if (comparison == 0) { // Delete it (or them!) // 1. Write the full instruction for the next // delete-stage. 
                        // That includes the read-in value.
                        //
                        // 2. "Delete" the entry by skipping the
                        // readerKey.  We DO NOT go to the next edit
                        // instruction!  There might still be more
                        // entries in the database to which we should
                        // apply this delete-edit.
                        //

                        // Step 1.  Write entry for future table-edits
                        if (futureEdits != null) {
                            linksByURLEdits++;
                            liwriter.appendInstructionInfo(futureEdits, readerItem, DEL_LINK, NullWritable.get());
                        }

                        // Step 2.
                        // A DEL_LINK may have to remove several entries
                        // that share this MD5, so keep the edit
                        // instruction around.
                        hasEntries = db.next(readerItem, NullWritable.get());
                        if (curInstruction == DEL_SINGLE_LINK) {
                            hasEdits = edits.next(editItem);
                        }
                    } else if (comparison > 0) {
                        // Ignore, move on to next instruction
                        hasEdits = edits.next(editItem);
                    }
                }
            }

            // Now we have only edits.  No more preexisting items!
            while (! hasEntries && hasEdits) {
                int curInstruction = editItem.getInstruction();
                if (curInstruction == ADD_LINK) {
                    // 1. Write down the item for future table-edits
                    if (futureEdits != null) {
                        linksByURLEdits++;
                        liwriter.appendInstructionInfo(futureEdits, editItem.getLink(), ADD_LINK, NullWritable.get());
                    }

                    // 2. Just add the item from the edit list
                    newDb.append(editItem.getLink(), NullWritable.get());
                    itemsWritten++;
                } else if (curInstruction == DEL_LINK) {
                    // Ignore operation
                }
                // Move on to next edit
                hasEdits = edits.next(editItem);
            }

            // Now we have only preexisting items.  Just copy them
            // to the new file, in order.
            while (hasEntries && ! hasEdits) {
                newDb.append(readerItem, NullWritable.get());
                itemsWritten++;
                hasEntries = db.next(readerItem, NullWritable.get());
            }
        }
    }

    /**
     * This class helps the LinksByURLProcessor test a list of
     * Page objects, sorted by URL, for outlink-counts.  We query
     * this class with a series of questions, based on Links sorted
     * by target URL.
     */
    private class TargetTester {
        MapFile.Reader pagedb;
        boolean hasPage = false;
        UTF8 pageURL = null;
        Page page = null;

        /** */
        public TargetTester(MapFile.Reader pagedb) throws IOException {
            this.pagedb = pagedb;
            this.pageURL = new UTF8();
            this.page = new Page();
            this.hasPage = pagedb.next(pageURL, page);
        }

        /**
         * Match the given URL against the sorted series of Page URLs.
         */
        public int hasOutlinks(UTF8 curURL) throws IOException {
            int returnCode = NO_OUTLINKS;
            int comparison = pageURL.compareTo(curURL);
            while (hasPage && comparison < 0) {
                hasPage = pagedb.next(pageURL, page);
                if (hasPage) {
                    comparison = pageURL.compareTo(curURL);
                }
            }

            if (hasPage) {
                if (comparison == 0) {
                    returnCode = (page.getNumOutlinks() > 0) ? HAS_OUTLINKS : NO_OUTLINKS;
                } else if (comparison > 0) {
                    //
                    // This situation indicates that the Link's
                    // target page has been deleted, probably
                    // because we repeatedly failed to fetch the URL.
                    // So, we should delete the Link.
                    //
                    returnCode = LINK_INVALID;
                }
            }
            return returnCode;
        }

        /** */
        public void close() throws IOException {
            pagedb.close();
        }
    }

    /**
     * Closes down and merges changes to the URL-driven link
     * table.  While merging, it refreshes each Link's
     * "targetHasOutlink" flag against the newly-written pagesByURL
     * table, and emits fix-up instructions (via futureEdits) for a
     * second pass over the linksByMD5 table.
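     *
     * The per-link outcome of the target test is:
     * <pre>
     *   LINK_INVALID  -> drop the link (DEL_SINGLE_LINK sent to the MD5 table)
     *   HAS_OUTLINKS  -> keep it, targetHasOutlink = true
     *   NO_OUTLINKS   -> keep it, targetHasOutlink = false
     * </pre>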
*/ private class LinksByURLProcessor extends CloseProcessor { MapFile.Reader pageDb; EditSectionGroupWriter futureEdits; /** */ public LinksByURLProcessor(MapFile.Reader db, EditSectionGroupWriter editWriter, MapFile.Reader pageDb, EditSectionGroupWriter futureEdits) { super(LINKS_BY_URL, db, editWriter, new SequenceFile.Sorter(new LinkInstruction.UrlComparator(), NullWritable.class), new Link.UrlComparator(), Link.class, NullWritable.class, "LinksByURLPart"); this.pageDb = pageDb; this.futureEdits = futureEdits; } /** */ public long closeDown(NutchFile workingDir, NutchFile outputDir) throws IOException { long result = super.closeDown(workingDir, outputDir); pageDb.close(); return result; } /** * Merge the existing db with the edit-stream into a brand-new file. */ void mergeEdits(MapFile.Reader db, SequenceFile.Reader sortedEdits, MapFile.Writer newDb) throws IOException { WritableComparator comparator = new Link.UrlComparator(); // Create the keys and vals we'll use LinkInstruction editItem = new LinkInstruction(); Link readerItem = new Link(); // Read the first items from both streams boolean hasEntries = db.next(readerItem, NullWritable.get()); boolean hasEdits = sortedEdits.next(editItem, NullWritable.get()); TargetTester targetTester = new TargetTester(pageDb); // As long as we have both edits and entries to process, // we need to interleave them while (hasEntries && hasEdits) { int curInstruction = editItem.getInstruction(); if (curInstruction == ADD_LINK) { // When we add a link, we may replace a previous // link with identical URL and MD5 values. Our // comparator will test both // int comparison = comparator.compare(readerItem, editItem.getLink()); if (comparison < 0) { // Write the readerKey, just passing it along. // Don't process the edit yet. int linkTest = targetTester.hasOutlinks(readerItem.getURL()); if (linkTest == LINK_INVALID) { liwriter.appendInstructionInfo(futureEdits, readerItem, DEL_SINGLE_LINK, NullWritable.get()); targetOutlinkEdits++; } else { boolean oldOutlinkStatus = readerItem.targetHasOutlink(); boolean newOutlinkStatus = (linkTest == HAS_OUTLINKS); // Do the conditional so we minimize unnecessary // mod-writes. if (oldOutlinkStatus != newOutlinkStatus) { readerItem.setTargetHasOutlink(newOutlinkStatus); liwriter.appendInstructionInfo(futureEdits, readerItem, ADD_LINK, NullWritable.get()); targetOutlinkEdits++; } newDb.append(readerItem, NullWritable.get()); itemsWritten++; } hasEntries = db.next(readerItem, NullWritable.get()); } else if (comparison == 0) { // Write the new item, "replacing" the old one. // We move to the next edit instruction and move // past the replaced db entry. Link editLink = editItem.getLink(); int linkTest = targetTester.hasOutlinks(editLink.getURL()); // Delete the edit/readerItem from the other table if it's // found to be invalid. if (linkTest == LINK_INVALID) { liwriter.appendInstructionInfo(futureEdits, editLink, DEL_SINGLE_LINK, NullWritable.get()); } else { editLink.setTargetHasOutlink(linkTest == HAS_OUTLINKS); liwriter.appendInstructionInfo(futureEdits, editLink, ADD_LINK, NullWritable.get()); newDb.append(editLink, NullWritable.get()); itemsWritten++; } targetOutlinkEdits++; hasEntries = db.next(readerItem, NullWritable.get()); hasEdits = sortedEdits.next(editItem, NullWritable.get()); } else if (comparison > 0) { // Write the new item. We stay at the current // db entry. 
Link editLink = editItem.getLink(); int linkTest = targetTester.hasOutlinks(editLink.getURL()); // Delete the edit from the other table if it's invalid if (linkTest == LINK_INVALID) { liwriter.appendInstructionInfo(futureEdits, editLink, DEL_SINGLE_LINK, NullWritable.get()); } else { editLink.setTargetHasOutlink(linkTest == HAS_OUTLINKS); liwriter.appendInstructionInfo(futureEdits, editLink, ADD_LINK, NullWritable.get()); newDb.append(editLink, NullWritable.get()); itemsWritten++; } targetOutlinkEdits++; hasEdits = sortedEdits.next(editItem, NullWritable.get()); } } else if (curInstruction == DEL_LINK) { // When we delete a link, we do it by MD5 and apply // it to the index first. A single delete instruction // may remove many items in the db, during the earlier // processing. However, unlike the index-processing stage, // here we can expect a new DEL instruction for every // item that we remove from the db. // int comparison = comparator.compare(readerItem, editItem.getLink()); if (comparison < 0) { // Write readerKey, just passing it along. Don't // process the edit yet. int linkTest = targetTester.hasOutlinks(readerItem.getURL()); // Delete the reader item if it's found to be invalid if (linkTest == LINK_INVALID) { liwriter.appendInstructionInfo(futureEdits, readerItem, DEL_SINGLE_LINK, NullWritable.get()); } else { readerItem.setTargetHasOutlink(linkTest == HAS_OUTLINKS); liwriter.appendInstructionInfo(futureEdits, readerItem, ADD_LINK, NullWritable.get()); newDb.append(readerItem, NullWritable.get()); itemsWritten++; } targetOutlinkEdits++; hasEntries = db.next(readerItem, NullWritable.get()); } else if (comparison == 0) { // "Delete" the item by passing by the readerKey. // We want a new entry, as well as the next instruction // to process. hasEntries = db.next(readerItem, NullWritable.get()); hasEdits = sortedEdits.next(editItem, NullWritable.get()); } else if (comparison > 0) { // Ignore, move on to next instruction hasEdits = sortedEdits.next(editItem, NullWritable.get()); } } } // Now we have only edits. No more preexisting items! while (! hasEntries && hasEdits) { int curInstruction = editItem.getInstruction(); if (curInstruction == ADD_LINK) { // // Add the item from the edit list. // // // Make sure the outlinks flag is set properly. // Link editLink = editItem.getLink(); int linkTest = targetTester.hasOutlinks(editLink.getURL()); if (linkTest == LINK_INVALID) { liwriter.appendInstructionInfo(futureEdits, editLink, DEL_SINGLE_LINK, NullWritable.get()); } else { editLink.setTargetHasOutlink(linkTest == HAS_OUTLINKS); liwriter.appendInstructionInfo(futureEdits, editLink, ADD_LINK, NullWritable.get()); newDb.append(editLink, NullWritable.get()); itemsWritten++; } targetOutlinkEdits++; } else if (curInstruction == DEL_LINK) { // Ignore operation } // Move on to next edit hasEdits = sortedEdits.next(editItem, NullWritable.get()); } // Now we have only preexisting items. Just copy them // to the new file, in order. while (hasEntries && ! hasEdits) { // // Simply copy the remaining database items. // // // First, make sure the 'outlinks' flag is set properly. 
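                // targetTester walks the URL-sorted pagesByURL table in
                // lockstep with these URL-sorted links, so each
                // hasOutlinks() call only ever advances forward; it never
                // has to seek backwards.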
// int linkTest = targetTester.hasOutlinks(readerItem.getURL()); if (linkTest == LINK_INVALID) { liwriter.appendInstructionInfo(futureEdits, readerItem, DEL_SINGLE_LINK, NullWritable.get()); targetOutlinkEdits++; } else { boolean oldOutlinkStatus = readerItem.targetHasOutlink(); boolean newOutlinkStatus = (linkTest == HAS_OUTLINKS); if (oldOutlinkStatus != newOutlinkStatus) { readerItem.setTargetHasOutlink(newOutlinkStatus); liwriter.appendInstructionInfo(futureEdits, readerItem, ADD_LINK, NullWritable.get()); targetOutlinkEdits++; } // Now copy the object newDb.append(readerItem, NullWritable.get()); itemsWritten++; } // Move on to next hasEntries = db.next(readerItem, NullWritable.get()); } targetTester.close(); } } /** * Method useful for the first time we create a distributed db project. * Basically need to write down the number of dirs we can expect. */ public static void createDB(NutchFileSystem nutchfs, String dbName, int totalMachines) throws IOException { // // Check to see if the db already exists // NutchFile machineInfo = new NutchFile(nutchfs, dbName, "standard", new File("machineinfo")); if (nutchfs.get(machineInfo, LONG_TIMEOUT) != null) { throw new IOException("Cannot create WebDB at nutchfs " + nutchfs + " with name " + dbName + ", as it already exists."); } // // Write down how many machines live in the distributed pool // File machineInfoFile = nutchfs.getWorkingFile(); DataOutputStream out = new DataOutputStream(new FileOutputStream(machineInfoFile)); try { out.write(MACHINE_INFO_VERSION); out.writeInt(totalMachines); } finally { out.close(); } nutchfs.put(machineInfo, machineInfoFile, true); // // Create the lower directory structures for each machine in pool. // for (int i = 0; i < totalMachines; i++) { NutchFile dbDir = new NutchFile(nutchfs, dbName, "standard", new File("webdb")); NutchFile sectionDir = new NutchFile(dbDir, "dbsection." + i); NutchFile pagesByURLNF = new NutchFile(sectionDir, PAGES_BY_URL); NutchFile pagesByMD5NF = new NutchFile(sectionDir, PAGES_BY_MD5); NutchFile linksByURLNF = new NutchFile(sectionDir, LINKS_BY_URL); NutchFile linksByMD5NF = new NutchFile(sectionDir, LINKS_BY_MD5); File pagesByURLFile = nutchfs.getWorkingFile(); File pagesByMD5File = nutchfs.getWorkingFile(); File linksByURLFile = nutchfs.getWorkingFile(); File linksByMD5File = nutchfs.getWorkingFile(); // // If we're creating the db, we make a zero-length file for each // db file // new MapFile.Writer(pagesByURLFile.getPath(), new UTF8.Comparator(), Page.class).close(); new MapFile.Writer(pagesByMD5File.getPath(), new Page.Comparator(), NullWritable.class).close(); new MapFile.Writer(linksByURLFile.getPath(), new Link.UrlComparator(), NullWritable.class).close(); new MapFile.Writer(linksByMD5File.getPath(), new Link.MD5Comparator(), NullWritable.class).close(); nutchfs.put(pagesByURLNF, pagesByURLFile, true); nutchfs.put(pagesByMD5NF, pagesByMD5File, true); nutchfs.put(linksByURLNF, linksByURLFile, true); nutchfs.put(linksByMD5NF, linksByMD5File, true); } // // Create the "ready-to-use" flag that tells all subsequent // WebDBWriters it's OK to proceed. 
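        //
        // To recap, a minimal bootstrap looks like this (the path and
        // machine count here are illustrative):
        //
        //   NutchFileSystem nfs = new NutchNFSFileSystem(new File("/share/db"), true);
        //   DistributedWebDBWriter.createDB(nfs, "db", 4);   // once, for 4 machines
        //   // ...then each machine i opens new DistributedWebDBWriter(nfs, "db", i)
        //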
// File readyToUseFile = nutchfs.getWorkingFile(); NutchFile readyToUse = new NutchFile(nutchfs, dbName, "standard", new File("readyToUse")); out = new DataOutputStream(new FileOutputStream(readyToUseFile)); try { out.writeInt(READY_TO_USE); // Magic number } finally { out.close(); } nutchfs.put(readyToUse, readyToUseFile, false); } PageInstructionWriter piwriter = new PageInstructionWriter(); LinkInstructionWriter liwriter = new LinkInstructionWriter(); DataInputBuffer inBuf = new DataInputBuffer(); DataOutputBuffer outBuf = new DataOutputBuffer(); NutchFileSystem nutchfs; String dbName; NutchFile dbDir, oldDbDir, newDbDir, tmpDir; NutchFile localWriteLock, globalWriteLock, closeCounter, openCounter; EditSectionGroupWriter pagesByURLWriter, pagesByMD5Writer, linksByURLWriter, linksByMD5Writer; MapFile.Reader pagesByURL, pagesByMD5, linksByURL, linksByMD5; long pagesByURLEdits = 0, pagesByMD5Edits = 0, linksByURLEdits = 0, linksByMD5Edits = 0, targetOutlinkEdits = 0; int machineNum, totalMachines; /** * Open the db files. */ public DistributedWebDBWriter(NutchFileSystem nutchfs, String dbName, int machineNum) throws IOException { // // Store the nutchfs. Build dir set. // this.nutchfs = nutchfs; this.dbName = dbName; this.machineNum = machineNum; this.dbDir = new NutchFile(nutchfs, dbName, "standard", new File("webdb")); this.oldDbDir = new NutchFile(nutchfs, dbName, "standard", new File("webdb.old")); this.newDbDir = new NutchFile(nutchfs, dbName, "standard", new File("webdb.new")); this.tmpDir = new NutchFile(newDbDir, "tmp"); // // Wait indefinitely for "ready-to-use-flag". // NutchFile readyToUse = new NutchFile(nutchfs, dbName, "standard", new File("readyToUse")); nutchfs.get(readyToUse); ////////////////////////////////////////////////////////// // Locks ////////////////////////////////////////////////////////// // 1. Each dbsection has a lock so only one writer ever accesses // it at once. Lock the local one immediately. this.localWriteLock = new NutchFile(nutchfs, dbName, "standard", new File("sectionLock." + machineNum)); nutchfs.lock(localWriteLock, true); // 2. A global writeLock protects writers that need to make // changes that affect many processors (such as moving dbDir or // deleting tmp). // // Readers will obtain this lock non-exclusively. When it comes // time for global changes to the db, writers will obtain it // exclusively. Readers need to leave before these changes can // be made. this.globalWriteLock = new NutchFile(nutchfs, dbName, "standard", new File("globalWriteLock")); // 3. Not quite a lock, but related: the closeCounter, which // tracks how many processors have made it through the db close // sequence. This is protected by globalWriteLock. this.openCounter = new NutchFile(newDbDir, "openCounter"); this.closeCounter = new NutchFile(newDbDir, "closeCounter"); ////////////////////////////////////////////////////////// // Setup and Initialization ////////////////////////////////////////////////////////// // Load # of machines NutchFile machineInfo = new NutchFile(nutchfs, dbName, "standard", new File("machineinfo")); File machineInfoFile = nutchfs.get(machineInfo); DataInputStream in = new DataInputStream(new FileInputStream(machineInfoFile)); try { in.read(); // version this.totalMachines = in.readInt(); } finally { in.close(); } // // Seize global lock // nutchfs.lock(globalWriteLock, true); // Now we use these locks to resolve any partially-completed // state directories from a previous run. 
// REMIND - mjc - Fixing/defining the db/newdb and tmp-delete // sequence is the most important next step! /*** File oldDbDirFile = nutchfs.get(oldDbDir, SHORT_TIMEOUT); if (oldDbDirFile != null) { File dbDirFile = nutchfs.get(dbDir, SHORT_TIMEOUT); if (dbDirFile != null) { throw new IOException("Impossible condition: directories " + oldDbDir + " and " + dbDir + " cannot exist simultaneously"); } File newDbDirFile = nutchfs.get(newDbDir, SHORT_TIMEOUT); if (newDbDirFile != null) { nutchfs.renameTo(newDbDir, dbDir); } nutchfs.delete(oldDbDir); } else { File newDbDirFile = nutchfs.get(newDbDir, SHORT_TIMEOUT); if (newDbDirFile != null) { nutchfs.delete(newDbDir); } } // Delete any partial edits from last time. if (nutchfs.get(tmpDir, LONG_TIMEOUT) != null) { nutchfs.delete(tmpDir); } ****/ // Load how many machines have started yet. If we're the // first one, then we have to create the EditSectionWriter // structures. int numOpens = 0; File openCounterFile = nutchfs.get(openCounter, LONG_TIMEOUT); if (openCounterFile != null) { in = new DataInputStream(new FileInputStream(openCounterFile)); try { in.read(); // version numOpens = in.readInt(); } finally { in.close(); } } else { openCounterFile = nutchfs.getWorkingFile(); } // Bump number by 1. DataOutputStream out = new DataOutputStream(new FileOutputStream(openCounterFile)); try { out.write(OPEN_COUNTER_VERSION); out.writeInt(numOpens + 1); } finally { out.close(); } nutchfs.put(openCounter, openCounterFile, true); // Check if we're the first ones to open. if (numOpens == 0) { // Build an edit-section for each of the 4 edit types EditSectionGroupWriter.createEditGroup(nutchfs, dbName, PAGES_BY_URL, totalMachines, EditSectionGroupWriter.URL_KEYSPACE); EditSectionGroupWriter.createEditGroup(nutchfs, dbName, PAGES_BY_MD5, totalMachines, EditSectionGroupWriter.MD5_KEYSPACE); EditSectionGroupWriter.createEditGroup(nutchfs, dbName, LINKS_BY_URL, totalMachines, EditSectionGroupWriter.URL_KEYSPACE); EditSectionGroupWriter.createEditGroup(nutchfs, dbName, LINKS_BY_MD5, totalMachines, EditSectionGroupWriter.MD5_KEYSPACE); // Remove the flag that tells readers it's OK to proceed NutchFile dirIsComplete = new NutchFile(dbDir, "dbIsComplete"); nutchfs.delete(dirIsComplete); } // These are the NutchFiles for this section of the read-only // db. NutchFile sectionDir = new NutchFile(dbDir, "dbsection." + machineNum); NutchFile pagesByURLNF = new NutchFile(sectionDir, PAGES_BY_URL); NutchFile pagesByMD5NF = new NutchFile(sectionDir, PAGES_BY_MD5); NutchFile linksByURLNF = new NutchFile(sectionDir, LINKS_BY_URL); NutchFile linksByMD5NF = new NutchFile(sectionDir, LINKS_BY_MD5); // // Release the global lock // nutchfs.release(globalWriteLock); // Create Readers for the above NutchFiles this.pagesByURL = new MapFile.Reader(nutchfs.get(pagesByURLNF).getPath(), new UTF8.Comparator()); this.pagesByMD5 = new MapFile.Reader(nutchfs.get(pagesByMD5NF).getPath(), new Page.Comparator()); this.linksByURL = new MapFile.Reader(nutchfs.get(linksByURLNF).getPath(), new Link.UrlComparator()); this.linksByMD5 = new MapFile.Reader(nutchfs.get(linksByMD5NF).getPath(), new Link.MD5Comparator()); // Create writers for new edit-files. We write changes // into these files, then apply them to the db upon close(). 
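        // Each EditSectionGroupWriter appears to partition its instructions
        // across the totalMachines sections by the supplied extractor's key
        // (URL or MD5), so that at close() every machine can fetch exactly
        // the edits that fall in its own key range.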
this.pagesByURLWriter = new EditSectionGroupWriter(nutchfs, dbName, machineNum, totalMachines, PAGES_BY_URL, PageInstruction.class, NullWritable.class, new EditSectionGroupWriter.PageURLExtractor()); this.pagesByMD5Writer = new EditSectionGroupWriter(nutchfs, dbName, machineNum, totalMachines, PAGES_BY_MD5, PageInstruction.class, NullWritable.class, new EditSectionGroupWriter.PageMD5Extractor()); this.linksByURLWriter = new EditSectionGroupWriter(nutchfs, dbName, machineNum, totalMachines, LINKS_BY_URL, LinkInstruction.class, NullWritable.class, new EditSectionGroupWriter.LinkURLExtractor()); this.linksByMD5Writer = new EditSectionGroupWriter(nutchfs, dbName, machineNum, totalMachines, LINKS_BY_MD5, LinkInstruction.class, NullWritable.class, new EditSectionGroupWriter.LinkMD5Extractor()); } /** * Shutdown */ public synchronized void close() throws IOException { // Process the 4 tables: // 1. pagesByURL // 2. pagesByMD5 // 3. linksByMD5 // 4. linksByURL // 1. Process pagesByURL. Processing this stream will // generate a number of edits for the pagesByMD5 step. // CloseProcessor pagesByURLProcessor = new PagesByURLProcessor(pagesByURL, pagesByURLWriter, pagesByMD5Writer); long numPBUItems = pagesByURLProcessor.closeDown(tmpDir, newDbDir); // // 2. Process the pagesByMD5 edit stream. This will // make calls to deleteLink(), which are processed later. // CloseProcessor pagesByMD5Processor = new PagesByMD5Processor(pagesByMD5, pagesByMD5Writer); long numPBMItems = pagesByMD5Processor.closeDown(tmpDir, newDbDir); // // 3. Process the linksByMD5 edit stream first. This // will generate a number of edits for the linksByURL // stream. This also processes the calls to deleteLink() // that may have been invoked as part of the above call // to process pagesByMD5. CloseProcessor linksByMD5Processor = new LinksByMD5Processor(linksByMD5, linksByMD5Writer, linksByURLWriter); long numLBMItems = linksByMD5Processor.closeDown(tmpDir, newDbDir); // // 4. Process the linksByURL edit stream. This will also // read through the sorted PagesByURL file, and modify // the Links so that they indicated whether the target // Page has any outlinks or not. // // Duplicate the LINKS_BY_MD5 editsWriter, because the 1st one has // already been closed. EditSectionGroupWriter targetOutlinkEditsWriter = new EditSectionGroupWriter(nutchfs, dbName, machineNum, totalMachines, LINKS_BY_MD5, LinkInstruction.class, NullWritable.class, new EditSectionGroupWriter.LinkMD5Extractor()); // Find the just-written dbsection output for PAGES_BY_URL NutchFile newSectionDir = new NutchFile(newDbDir, "dbsection." + machineNum); NutchFile newPagesByURLNF = new NutchFile(newSectionDir, PAGES_BY_URL); CloseProcessor linksByURLProcessor = new LinksByURLProcessor(linksByURL, linksByURLWriter, new MapFile.Reader(nutchfs.get(newPagesByURLNF).getPath(), new UTF8.Comparator()), targetOutlinkEditsWriter); long numLBUItems = linksByURLProcessor.closeDown(tmpDir, newDbDir); // // If the number of linksByURL processed is zero, then // there's no reason to do all of the following with // a 2nd pass through linksByMD5. // if (numLBUItems != 0) { // // 5. Step 4 did several things to the LinksByURL db. // First, it implemented all the changes generated // by instructions from LinksByMD5Processor. Second, // it made lots of calls to setTargetHasOutlink. This // changes the content of the Link objects. // // So now we need to reconstruct the LinksByMD5 // list, using the Links we created in step #4. 
// NutchFile newLinksByMD5NF = new NutchFile(newSectionDir, LINKS_BY_MD5); MapFile.Reader linksByMD5ForStageTwo = new MapFile.Reader(nutchfs.get(newLinksByMD5NF).getPath(), new Link.MD5Comparator()); NutchFile stageTwoDbDir = new NutchFile(newDbDir, "stage2.subdir"); CloseProcessor linksByMD5StageTwoProcessor = new LinksByMD5Processor(linksByMD5ForStageTwo, targetOutlinkEditsWriter, null); numLBMItems = linksByMD5StageTwoProcessor.closeDown(tmpDir, stageTwoDbDir); // // 6. Now move the Stage2 LinksByMD5 file up to replace // the one at the primary level // linksByMD5ForStageTwo.close(); NutchFile stageOneLinksByMD5 = new NutchFile(newDbDir, LINKS_BY_MD5); NutchFile stageTwoLinksByMD5 = new NutchFile(stageTwoDbDir, LINKS_BY_MD5); nutchfs.delete(stageOneLinksByMD5); nutchfs.renameTo(stageTwoLinksByMD5, stageOneLinksByMD5); } // // 7. Finally, write out the total num of pages and links // //NutchFile newSectionDir = new NutchFile(newDbDir, "dbsection." + machineNum); NutchFile sectionStats = new NutchFile(newSectionDir, STATS_FILE); File sectionStatsFile = nutchfs.getWorkingFile(); DataOutputStream out = new DataOutputStream(new FileOutputStream(sectionStatsFile)); try { // // These counts are guaranteed to be correct; they're // based on the counts made during processing of primary-key // edits. Pages are always counted by URL first, and only // subsequently by MD5 if there are any edits to make. Links // are always counted by MD5 first, and only by URL subsequently // and conditionally. // // If there are a bunch of edits that result in no modifications // to the db, the two sets of counts (one for URL, one for // MD5) could become out of sync. So we use the ones that // are sure to be accurate. // out.write(CUR_VERSION); out.writeLong(numPBUItems); out.writeLong(numLBMItems); } finally { out.close(); nutchfs.put(sectionStats, sectionStatsFile, true); } // Close down the db-readers pagesByURL.close(); pagesByMD5.close(); linksByMD5.close(); linksByURL.close(); ////////////////////////////////////////////////////////////// // Now we need to do a distributed-close. It works by // the "last person out turns off the lights" protocol. // All the processors but one will exit without doing anything. // The last one to exit does all the directory moves. ////////////////////////////////////////////////////////////// // // First step is to obtain the global writeLock exclusively. // DBReaders will try to obtain this non-exclusively. That // way, there can be many readers at once, but these must // leave before a single process can blow away the directories. // nutchfs.lock(globalWriteLock, true); // // Read in how many processes have closed already // int numCloses = 0; File closeCounterFile = nutchfs.get(closeCounter, LONG_TIMEOUT); if (closeCounterFile != null) { DataInputStream in = new DataInputStream(new FileInputStream(closeCounterFile)); try { in.read(); // version numCloses = in.readInt(); } finally { in.close(); } } else { closeCounterFile = nutchfs.getWorkingFile(); } if (numCloses == totalMachines) { throw new IOException("All the processors have already shut down. Impossible condition!"); } // Bump that number by 1. out = new DataOutputStream(new FileOutputStream(closeCounterFile)); try { out.write(CLOSE_COUNTER_VERSION); out.writeInt(numCloses + 1); } finally { out.close(); } nutchfs.put(closeCounter, closeCounterFile, true); // Check if this processor is the last one to close. if (numCloses == totalMachines - 1) { // Delete edits that might still be lingering around... 
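            // Only the last closer reaches this point, so every machine has
            // finished its merge; any edit sections still on disk are
            // leftovers that should be safe to remove.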
            // Delete edits that might still be lingering around...
            for (int i = 0; i < totalMachines; i++) {
                new EditSectionGroupReader(nutchfs, dbName, PAGES_BY_URL, i, totalMachines).delete();
                new EditSectionGroupReader(nutchfs, dbName, PAGES_BY_MD5, i, totalMachines).delete();
                new EditSectionGroupReader(nutchfs, dbName, LINKS_BY_URL, i, totalMachines).delete();
                new EditSectionGroupReader(nutchfs, dbName, LINKS_BY_MD5, i, totalMachines).delete();
            }

            // Complete directories and move them into place
            nutchfs.completeDir(tmpDir);
            nutchfs.completeDir(dbDir);
            nutchfs.completeDir(newDbDir);

            //
            // Write out the "complete" flag, which tells
            // readers it's OK to proceed
            //
            File dirIsCompleteFile = nutchfs.getWorkingFile();
            NutchFile dirIsComplete = new NutchFile(newDbDir, "dbIsComplete");
            out = new DataOutputStream(new FileOutputStream(dirIsCompleteFile));
            try {
                out.writeInt(IS_COMPLETE);    // Magic number
            } finally {
                out.close();
            }
            nutchfs.put(dirIsComplete, dirIsCompleteFile, true);

            // Here we need to 'finish' the db operation.
            // That involves: 1. Removing the tmpdir.
            //                2. Moving the dbDir to oldDbDir
            //                3. Renaming the newDbDir to dbDir
            //                4. Removing the oldDbDir
            //
            // 1.
            nutchfs.delete(tmpDir);
            // 2.
            nutchfs.renameTo(dbDir, oldDbDir);
            // 3.
            nutchfs.renameTo(newDbDir, dbDir);
            // 4.
            nutchfs.delete(oldDbDir);
        }

        // Done.
        nutchfs.release(globalWriteLock);
        nutchfs.release(localWriteLock);
    }

    /////////////////////
    // Methods for adding and managing db operations
    /////////////////////

    /**
     * Add a page to the page database
     */
    public synchronized void addPage(Page page) throws IOException {
        // The 2nd (byMD5) part is handled during processing of the 1st.
        pagesByURLEdits++;
        piwriter.appendInstructionInfo(pagesByURLWriter, page, ADD_PAGE, NullWritable.get());
    }

    /**
     * Add a page to the page database, with a brand-new score
     */
    public synchronized void addPageWithScore(Page page) throws IOException {
        // The 2nd (byMD5) part is handled during processing of the 1st.
        pagesByURLEdits++;
        piwriter.appendInstructionInfo(pagesByURLWriter, page, ADD_PAGE_WITH_SCORE, NullWritable.get());
    }

    /**
     * Don't replace the one in the database, if there is one.
     */
    public synchronized void addPageIfNotPresent(Page page) throws IOException {
        // The 2nd (index) part is handled during processing of the 1st.
        pagesByURLEdits++;
        piwriter.appendInstructionInfo(pagesByURLWriter, page, ADD_PAGE_IFN_PRESENT, NullWritable.get());
    }

    /**
     * Don't replace the one in the database, if there is one.
     *
     * If we do insert the new Page, then we should also insert
     * the given Link object.
     */
    public synchronized void addPageIfNotPresent(Page page, Link link) throws IOException {
        // The 2nd (index) part is handled during processing of the 1st.
        pagesByURLEdits++;
        piwriter.appendInstructionInfo(pagesByURLWriter, page, link, ADD_PAGE_IFN_PRESENT, NullWritable.get());
    }

    /**
     * Remove a page from the page database.
     */
    public synchronized void deletePage(String url) throws IOException {
        // The 2nd (index) part is handled during processing of the 1st.
        Page p = new Page();
        p.setURL(url);
        pagesByURLEdits++;
        piwriter.appendInstructionInfo(pagesByURLWriter, p, DEL_PAGE, NullWritable.get());
    }

    /**
     * Add a link to the link database
     */
    public synchronized void addLink(Link lr) throws IOException {
        linksByMD5Edits++;
        liwriter.appendInstructionInfo(linksByMD5Writer, lr, ADD_LINK, NullWritable.get());
    }
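    //
    // All of the mutators above just buffer a PageInstruction or
    // LinkInstruction with the appropriate EditSectionGroupWriter;
    // nothing is applied to the db until close() processes the edit
    // streams.  deleteLink() below is private because clients never
    // call it directly -- it is invoked internally while the
    // pagesByMD5 edits are processed during close().
    //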
    /**
     * Remove links with the given MD5 from the db.
     */
    private synchronized void deleteLink(MD5Hash md5) throws IOException {
        linksByMD5Edits++;
        liwriter.appendInstructionInfo(linksByMD5Writer, new Link(md5, 0, "", ""), DEL_LINK, NullWritable.get());
    }

    /**
     * DistributedWebDBWriter.main() provides some handy methods for
     * testing the WebDB.
     */
    public static void main(String argv[]) throws FileNotFoundException, IOException {
        if (argv.length < 2) {
            System.out.println("Usage: java net.nutch.db.DistributedWebDBWriter <db> [-create <numProcessors>] | <machineInt> ([-addpage id url] | [-addpageifnp id url] | [-deletepage url] | [-addlink fromID url] | [-deletelink fromID])");
            return;
        }

        NutchFileSystem nutchfs = new NutchNFSFileSystem(new File(argv[0]), true);
        if ("-create".equals(argv[1])) {
            DistributedWebDBWriter.createDB(nutchfs, "db", Integer.parseInt(argv[2]));
            System.out.println("Created webdb at " + argv[0]);
        } else {
            int machineNum = Integer.parseInt(argv[1]);
            String cmd = argv[2];
            if ("-addpage".equals(cmd)) {
                MD5Hash md5 = new MD5Hash(argv[3]);
                String url = argv[4];

                DistributedWebDBWriter writer =
                    new DistributedWebDBWriter(nutchfs, "db", machineNum);
                try {
                    Page page = new Page(url, md5);
                    writer.addPageWithScore(page);
                    System.out.println("Added page (with score): " + page);
                } finally {
                    writer.close();
                }
            } else if ("-addpageifnp".equals(cmd)) {
                MD5Hash md5 = new MD5Hash(argv[3]);
                String url = argv[4];

                DistributedWebDBWriter writer =
                    new DistributedWebDBWriter(nutchfs, "db", machineNum);
                try {
                    Page page = new Page(url, md5);
                    writer.addPageIfNotPresent(page);
                    System.out.println("Added page: " + page);
                } finally {
                    writer.close();
                }
            } else if ("-deletepage".equals(cmd)) {
                String url = argv[3];

                DistributedWebDBWriter writer =
                    new DistributedWebDBWriter(nutchfs, "db", machineNum);
                try {
                    writer.deletePage(url.trim());
                    System.out.println("Deleted item(s)");
                } finally {
                    writer.close();
                }
            } else if ("-addlink".equals(cmd)) {
                MD5Hash fromID = new MD5Hash(argv[3]);
                String url = argv[4];

                DistributedWebDBWriter writer =
                    new DistributedWebDBWriter(nutchfs, "db", machineNum);
                try {
                    Link link = new Link(fromID,
                                         MD5Hash.digest("randomstring.com").halfDigest(),
                                         url,
                                         "SomeRandomAnchorText_" + System.currentTimeMillis());
                    writer.addLink(link);
                    System.out.println("Added link: " + link);
                } finally {
                    writer.close();
                }
            } else if ("-deletelink".equals(cmd)) {
                MD5Hash fromID = new MD5Hash(argv[3]);

                DistributedWebDBWriter writer =
                    new DistributedWebDBWriter(nutchfs, "db", machineNum);
                try {
                    writer.deleteLink(fromID);
                    System.out.println("Deleted item(s)");
                } finally {
                    writer.close();
                }
            } else {
                System.out.println("Sorry, no command with name " + cmd);
            }
        }
    }
}
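
/**
 * Illustrative usage sketch -- this class is not part of the original
 * Nutch source.  It drives DistributedWebDBWriter programmatically the
 * same way main() above does from the command line, assuming a
 * single-processor setup.  The db location ("./demo-db") and the URL
 * below are made-up values for the example.
 */
class DistributedWebDBWriterExample {
    public static void main(String[] args) throws IOException {
        // Lay out a webdb for one processor, then open the writer for
        // machine #0, the only section.
        NutchFileSystem nutchfs = new NutchNFSFileSystem(new File("./demo-db"), true);
        DistributedWebDBWriter.createDB(nutchfs, "db", 1);

        DistributedWebDBWriter writer = new DistributedWebDBWriter(nutchfs, "db", 0);
        try {
            // This call only buffers an ADD_PAGE_IFN_PRESENT
            // instruction; the edits are sorted and applied when
            // close() runs.
            Page page = new Page("http://example.com/", MD5Hash.digest("http://example.com/"));
            writer.addPageIfNotPresent(page);
        } finally {
            // As the last (and only) processor to close, this call
            // applies the edits and moves the new db into place.
            writer.close();
        }
    }
}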